All the installations are commented out so it do not install everytime
#install.packages("ggmap")
#install.packages("leaflet")
#install.packages("xts")
#install.packages("rgdal")
#install.packages("sf")
library(corrplot)
## corrplot 0.92 loaded
library(ggcorrplot)
## Loading required package: ggplot2
library(ggmap)
## ℹ Google's Terms of Service: ]8;;https://mapsplatform.google.com<https://mapsplatform.google.com>]8;;
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
library(leaflet)
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## ################################### WARNING ###################################
## # We noticed you have dplyr installed. The dplyr lag() function breaks how #
## # base R's lag() function is supposed to work, which breaks lag(my_xts). #
## # #
## # If you call library(dplyr) later in this session, then calls to lag(my_xts) #
## # that you enter or source() into this session won't work correctly. #
## # #
## # All package code is unaffected because it is protected by the R namespace #
## # mechanism. #
## # #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## # You can use stats::lag() to make sure you're not using dplyr::lag(), or you #
## # can add conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## ################################### WARNING ###################################
##
## Attaching package: 'xts'
## The following object is masked from 'package:leaflet':
##
## addLegend
library(rgdal)
## Loading required package: sp
## Please note that rgdal will be retired during 2023,
## plan transition to sf/stars/terra functions using GDAL and PROJ
## at your earliest convenience.
## See https://r-spatial.org/r/2022/04/12/evolution.html and https://github.com/r-spatial/evolution
## rgdal: version: 1.6-5, (SVN revision 1199)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.4.2, released 2022/03/08
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/4.2/Resources/library/rgdal/gdal
## GDAL binary built with GEOS: FALSE
## Loaded PROJ runtime: Rel. 8.2.1, January 1st, 2022, [PJ_VERSION: 821]
## Path to PROJ shared files: /Library/Frameworks/R.framework/Versions/4.2/Resources/library/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.6-0
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading sp or rgdal.
library(ggplot2)
library(Rcpp)
library(sf)
## Linking to GEOS 3.10.2, GDAL 3.4.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.0
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks xts::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks xts::last()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(lubridate)
library(dplyr)
library(forcats)
Let’s start by reading the data and check how many rows and columns are available
data <- read.csv("37-00049_UOF-P_2016_prepped.csv", header = TRUE)
data <- data[-1,]
dim(data)
## [1] 2383 47
head(data)
## INCIDENT_DATE INCIDENT_TIME UOF_NUMBER OFFICER_ID OFFICER_GENDER
## 2 9/3/16 4:14:00 AM 37702 10810 Male
## 3 3/22/16 11:00:00 PM 33413 7706 Male
## 4 5/22/16 1:29:00 PM 34567 11014 Male
## 5 1/10/16 8:55:00 PM 31460 6692 Male
## 6 11/8/16 2:30:00 AM 37879, 37898 9844 Male
## 7 9/11/16 7:20:00 PM 36724 9855 Male
## OFFICER_RACE OFFICER_HIRE_DATE OFFICER_YEARS_ON_FORCE OFFICER_INJURY
## 2 Black 5/7/14 2 No
## 3 White 1/8/99 17 Yes
## 4 Black 5/20/15 1 No
## 5 Black 7/29/91 24 No
## 6 White 10/4/09 7 No
## 7 White 6/10/09 7 No
## OFFICER_INJURY_TYPE OFFICER_HOSPITALIZATION SUBJECT_ID SUBJECT_RACE
## 2 No injuries noted or visible No 46424 Black
## 3 Sprain/Strain Yes 44324 Hispanic
## 4 No injuries noted or visible No 45126 Hispanic
## 5 No injuries noted or visible No 43150 Hispanic
## 6 No injuries noted or visible No 47307 Black
## 7 No injuries noted or visible No 46549 White
## SUBJECT_GENDER SUBJECT_INJURY SUBJECT_INJURY_TYPE
## 2 Female Yes Non-Visible Injury/Pain
## 3 Male No No injuries noted or visible
## 4 Male No No injuries noted or visible
## 5 Male Yes Laceration/Cut
## 6 Male No No injuries noted or visible
## 7 Female No No injuries noted or visible
## SUBJECT_WAS_ARRESTED SUBJECT_DESCRIPTION SUBJECT_OFFENSE
## 2 Yes Mentally unstable APOWW
## 3 Yes Mentally unstable APOWW
## 4 Yes Unknown APOWW
## 5 Yes FD-Unknown if Armed Evading Arrest
## 6 Yes Unknown Other Misdemeanor Arrest
## 7 Yes Unknown Assault/FV
## REPORTING_AREA BEAT SECTOR DIVISION LOCATION_DISTRICT STREET_NUMBER
## 2 2062 134 130 CENTRAL D14 211
## 3 1197 237 230 NORTHEAST D9 7647
## 4 4153 432 430 SOUTHWEST D6 716
## 5 4523 641 640 NORTH CENTRAL D11 5600
## 6 2167 346 340 SOUTHEAST D7 4600
## 7 1134 235 230 NORTHEAST D9 1234
## STREET_NAME STREET_DIRECTION STREET_TYPE
## 2 Ervay N St.
## 3 Ferguson NULL Rd.
## 4 bimebella dr NULL Ln.
## 5 LBJ NULL Frwy.
## 6 Malcolm X S Blvd.
## 7 Peavy NULL Rd.
## LOCATION_FULL_STREET_ADDRESS_OR_INTERSECTION LOCATION_CITY LOCATION_STATE
## 2 211 N ERVAY ST Dallas TX
## 3 7647 FERGUSON RD Dallas TX
## 4 716 BIMEBELLA LN Dallas TX
## 5 5600 L B J FWY Dallas TX
## 6 4600 S MALCOLM X BLVD Dallas TX
## 7 1234 PEAVY RD Dallas TX
## LOCATION_LATITUDE LOCATION_LONGITUDE INCIDENT_REASON REASON_FOR_FORCE
## 2 32.782205 -96.797461 Arrest Arrest
## 3 32.798978 -96.717493 Arrest Arrest
## 4 32.73971 -96.92519 Arrest Arrest
## 5 Arrest Arrest
## 6 Arrest Arrest
## 7 32.837527 -96.695566 Arrest Arrest
## TYPE_OF_FORCE_USED1 TYPE_OF_FORCE_USED2 TYPE_OF_FORCE_USED3
## 2 Hand/Arm/Elbow Strike
## 3 Joint Locks
## 4 Take Down - Group
## 5 K-9 Deployment
## 6 Verbal Command Take Down - Arm
## 7 Hand Controlled Escort
## TYPE_OF_FORCE_USED4 TYPE_OF_FORCE_USED5 TYPE_OF_FORCE_USED6
## 2
## 3
## 4
## 5
## 6
## 7
## TYPE_OF_FORCE_USED7 TYPE_OF_FORCE_USED8 TYPE_OF_FORCE_USED9
## 2
## 3
## 4
## 5
## 6
## 7
## TYPE_OF_FORCE_USED10 NUMBER_EC_CYCLES FORCE_EFFECTIVE
## 2 NULL Yes
## 3 NULL Yes
## 4 NULL Yes
## 5 NULL Yes
## 6 NULL No, Yes
## 7 NULL Yes
From the above summary of data we can understand that it is policing incident of texas dallas area. It has report of incidents. It has details of officer as well as details of subject. It also contains area details and force used or not. If any force used how many types of force used.
We will mainly focus on incident over time, subject by gender and race and force usage on subject. We will try to explore and analyse gender, race and subject and find relationship between them.
Here date is in string format. We will convert it into date object and make month and hour column. As month and hour will come handy to time series analyse.
data$date <- mdy(data$INCIDENT_DATE)
data$month <- format(data$date, "%m")
data$hour <- strptime(data$INCIDENT_TIME, format = "%I:%M:%S %p")
data$hour <- as.numeric(format(data$hour, "%H"))
table(format(data$date, "%y"))
##
## 16
## 2383
We can see all the data are from year 2016. So, year by year analysis is not feasible. We will focus more on monthly, weekly and hourly incident occurance.
# Count the number of occurrences per day
data_count <- data %>% group_by(date) %>% summarise(count = n())
# Plot the data using ggplot
ggplot(data_count, aes(x = date, y = count)) +
#geom_col() +
geom_line(size=0.5, col="gray") +
geom_smooth(method = "loess", color = "red", span = 1/5) +
scale_x_date(date_breaks = "1 month", date_labels = "%b") +
#theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
labs(x = "Day", y = "Incidents Count", title = "Incident count by Day")+
theme_bw()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
From the distribuition we can see the crime over the whole year. We can see decrease in incident rate at the end of the year. While it peaked around the March. Incident rate seems to between 4-25 per day. Let’s explore bit more.
summary(data_count$count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 6.000 6.751 9.000 24.000
We can see median incident being 6 and mean being 6.751. Maximum incident in one day is 24 and minimum being 1. However there there could be days where no incident happened which is oviously not in our database. We will explore that bit later.
boxplot(data_count$count,
main = "Incident per day at Texas",
xlab = "Incidents Per Day",
ylab = "",
col = "skyblue",
border = "black",
horizontal = TRUE,
notch = TRUE
)
We can see from the box plot that there is no outlier on the lower end
but there are few outlier on the upper end. Most of the cases it will
not have huge impact on the data.
ggplot(data_count, aes(count) ) +
geom_density(alpha = 0.5, colour = "black", fill ="skyblue")+ labs(x="Incidents count per day", y= "Density", title="Distribuion of incidents per day") +
theme_bw()
From the density plot, we can see most common occurance of crime is 3 to
5 per day. There are very few value at the higher end of the
distribution.
month <- as.Date(cut(data$date, breaks = "month"))
df <- data.frame(month,data$date)
data_count <- df %>% group_by(month) %>% summarise(count = n())
ggplot(data_count, aes(x = month, y = count)) +
geom_col(alpha = 0.5, colour = "black", fill ="skyblue") +
scale_x_date(date_breaks = "1 month", date_labels = "%b") +
labs(x = "Month", y = "Incidents Count", title = "Incidents Count by Month")
Month by distribuition of crime made our previous assumption more clear
that in february and march crime peaked while at the end of the year it
decreased. This is however is no pattern as it is only one year data.
So, it is hard to make yearly assumption out of it.
month <- format(data$date,"%m")
date <- format(data$date, "%d")
df <- data.frame(month,date)
data_count <- df %>% group_by(month, date) %>% summarise(count = n())
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
ggplot(data_count, aes(x= date, y= month,fill = count)) + geom_tile( ) +
geom_text(aes(date, month, label = count), color = "black", size = 3) + scale_y_discrete("Months",labels=c("January","February", "March", "April","May", "June","July","August", "September","October","November","December")) + labs(x="Days of Month", y= "Months", title=" Incident Rates across Dates and Months")+
scale_fill_gradientn(colours = c("white", "red"))
From day to day crime distribution it isi clear there are some day
without any crime like december 4th. Most of the higher value days are
in the first few months and blank and lower value days are at the last
few months which is quite normal considering previous monthly plots.
weekday <- weekdays(data$date)
df <- data.frame(weekday)
data_count <- df %>% group_by(weekday) %>% summarise(count = n())
# Start date from monday
data_count <- data_count %>%
mutate(weekday = factor(weekday, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")))
ggplot(data_count, aes(x = weekday, y = count)) +
geom_col(alpha = 0.5, colour = "black", fill ="skyblue") +
labs(x = "Day", y = "Incident Count", title = "Count of Incidents against Weekdays")
From the weekday crime occurance column it seems to be indicating
weekends are most crime prone. Friday as well has more crimes than other
weekdays. It could be because of friday night party as a start of
weekend. Overall sunday has the most incidents. However, saturday and
sunday incident could be more because of few days with more incidents
and thus increase the percentage of the incidents.
df <- data
data_count <- df %>% group_by(date, month) %>% summarise(count = n())
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.
data_count$weekday <- weekdays(data_count$date)
table(data_count$month)
##
## 01 02 03 04 05 06 07 08 09 10 11 12
## 31 29 30 30 30 30 29 31 29 29 27 28
data_count$day <- as.numeric(day(data_count$date))
table(data_count$day)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 12 12 12 10 12 11 12 11 11 12 12 12 11 12 11 12 12 12 12 12 12 12 12 11 12 12
## 27 28 29 30 31
## 12 10 10 10 7
data_count %>%
#filter(month %in% c("01")) %>%
ggplot(aes(x=day,y=count)) +
geom_point(aes(color=weekday),size=4) +
geom_line(aes(group=1),linetype='dotted') +
theme_bw() +
labs(fill='Weekdays') +
#scale_x_continuous("Day of the month", labels = as.character(data_count$day), breaks = data_count$day) +
facet_wrap(~month,nrow=6, scales = "free" )
From the weekday distribuition it is clear most friday, saturday and
sunday has more crime than other days of the week. It is consistant
about that over the year.
# Count the number of occurrences per day
data_count <- data %>% group_by(hour) %>% summarise(count = n())
data_count <- data_count[!is.na(data_count$hour),]
# Plot the data using ggplot
ggplot(data_count, aes(x = factor(hour), y = count)) +
geom_col(alpha = 0.5, colour = "black", fill ="skyblue") +
labs(x = "Hour in day", y = "Incidents Count", title = "Incidents Count by Hour in day") +
theme_bw()
As we can see incidents occurred more at night. From 5pm to 9pm is the
more the more crime prone. However this could increased by some specific
occurance of major incidents in that time period. Let’s explore it
more.
data_count <- data %>% group_by(date, hour) %>% summarise(count = n())
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.
data_count <- data_count[!is.na(data_count$hour),]
ggplot(data_count, aes(x = factor(hour), y = count)) +
geom_point(alpha = 0.1, colour = "red", aes(size=factor(count))) +
labs(x = "Hour in day", y = "Incidents Count", title = "Incidents in a day by Hour") +
theme_bw() +
labs(size="Incident Count")
## Warning: Using size for a discrete variable is not advised.
We divide the data by incidents in a day by hour. We set low alpha value
to determine overlap. From the above graph it is more clear that
incidents in the night is not random incidents. It is more during 5pm to
9pm as overlapping made it more solid color as well as we see higher
incidents in that interval.
getCategoryPercentages <- function(cat_var) {
# calculate the number of observations in each category
cat_counts <- table(cat_var)
# calculate the percentage of observations in each category
cat_percentages <- prop.table(cat_counts) * 100
# return the category percentages
return(cat_percentages)
}
getCategoryPercentages(data$SUBJECT_RACE)
## cat_var
## American Ind Asian Black Hispanic NULL Other
## 0.04196391 0.20981956 55.93789341 21.98908938 1.63659253 0.46160302
## White
## 19.72303819
There are 3 main races in the subject. ‘Black’ being the majority, followed by ‘Hispanic’ and ‘White’. Let’s ignore the other races for now.
data[!is.na(data$hour),] %>%
filter(SUBJECT_RACE %in% c('Black','White', 'Hispanic')) %>%
count(SUBJECT_RACE, hour) %>%
ggplot(aes(x=factor(hour),y=n,color=SUBJECT_RACE,group=SUBJECT_RACE)) +
geom_point() +
geom_line(linetype='dotted') +
labs(x = "Hour in a day", y = "Incidents Count", title = "Incidents in a day by Hour against race")+
theme_bw() +
scale_color_discrete(name = "Race")
Crime pattern seems similar over the day for all the three races.
df <- data
df$weekday <- weekdays(df$date)
df <- df %>%
mutate(weekday = factor(weekday, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")))
df %>%
filter(SUBJECT_RACE %in% c('Black','White', 'Hispanic')) %>%
count(SUBJECT_RACE, weekday) %>%
ggplot(aes(x=factor(weekday),y=n,color=SUBJECT_RACE,group=SUBJECT_RACE)) +
geom_point() +
geom_line(linetype='dotted') +
labs(x = "Weekdays", y = "Incidents Count", title = "Incidents in a day against Weekdays by race")+
theme_bw() +
scale_color_discrete(name = "Race")
‘Black’ subject tends to commit more crime on friday. ‘Hispanic’ subject
tend to commit more crime on sunday. Overall, all races commit more
crime on weekends and friday. However, ‘Hispanic’ subjects crime on
sunday is too much compared to other days. Let’s explore more.
data_count <- df %>% group_by(date, weekday, SUBJECT_RACE) %>% summarise(count = n())
## `summarise()` has grouped output by 'date', 'weekday'. You can override using
## the `.groups` argument.
data_count <- data_count %>% group_by(weekday, SUBJECT_RACE) %>% summarise(med = median(count))
## `summarise()` has grouped output by 'weekday'. You can override using the
## `.groups` argument.
data_count %>%
filter(SUBJECT_RACE %in% c('Black','White', 'Hispanic')) %>%
ggplot(aes(x=factor(weekday),y=med,color=SUBJECT_RACE,group=SUBJECT_RACE)) +
geom_point() +
geom_line(linetype='dotted') +
labs(x = "Weekdays", y = "Incidents Count", title = "Median Incidents against Weekdays by race")+
theme_bw()+
scale_color_discrete(name = "Race")
We are looking into median incidents by race. Sunday behaviour still
holds for ‘Hispanic’ subjects. ‘Black’ subjects are still commit more
crimes.
map_data <- data[!is.na(data$LOCATION_LATITUDE), ]
map_data <- map_data[!is.na(data$LOCATION_LONGITUDE), ]
map_data$LOCATION_LATITUDE = as.numeric(map_data$LOCATION_LATITUDE)
map_data$LOCATION_LONGITUDE = as.numeric(map_data$LOCATION_LONGITUDE)
names(map_data)[32] <- "lat"
names(map_data)[33] <- "lon"
map <- map_data %>% leaflet() %>%
addTiles(group = "OSM (default)") %>%
addProviderTiles(providers$Stamen.TonerLite, group = "Toner Lite") %>%
setView(-96.78,32.8,zoom = 10) %>%
addCircles(data = map_data[map_data$SUBJECT_RACE=="Black",], group = "Black", color = 'black', label=map_data$SUBJECT_OFFENSE)%>%
addCircles(data = map_data[map_data$SUBJECT_RACE=="Hispanic",], group = "Hispanic",color='blue',label=map_data$SUBJECT_OFFENSE)%>%
addCircles(data = map_data[map_data$SUBJECT_RACE=="White",], group = "White",color="red", label=map_data$SUBJECT_OFFENSE)
## Assuming "lon" and "lat" are longitude and latitude, respectively
## Warning in validateCoords(lng, lat, funcName): Data contains 10 rows with
## either missing or invalid lat/lon values and will be ignored
## Assuming "lon" and "lat" are longitude and latitude, respectively
## Warning in validateCoords(lng, lat, funcName): Data contains 8 rows with either
## missing or invalid lat/lon values and will be ignored
## Assuming "lon" and "lat" are longitude and latitude, respectively
## Warning in validateCoords(lng, lat, funcName): Data contains 32 rows with
## either missing or invalid lat/lon values and will be ignored
map%>% addLayersControl(
baseGroups = c("OSM (default)", "Toner Lite"),
overlayGroups = c("Black","White","Hispanic"),
options = layersControlOptions(collapsed = TRUE)) %>%
leaflet::addLegend(
position = "bottomright",
colors = c('black', 'red','blue'),
labels = c('Black','White','Hispanic'), opacity = 1,
title = "Race"
)
From the data we can observe that white subject incidents are more common in centre of the map and on the upper portion. There are low crimes on lower portion of the map commited by white subjects.
On the other hand black subject crimes are more common in the centre and lower part of the map.
Hispanic crimes are spread all over the map. But, it got little bit more crime on the left side of the map.
Overall centre of the map got more crime followed by lower left portion. Upper left portion of the map contains least crime.
freq_table <- table(data$SUBJECT_OFFENSE)
filtered_df <- data[data$SUBJECT_OFFENSE %in% names(freq_table[freq_table >= 20]), ]
filtered_df <- filtered_df %>%
filter(SUBJECT_RACE %in% c('Black','White', 'Hispanic'))
data_count <- filtered_df %>% group_by(SUBJECT_OFFENSE, SUBJECT_RACE) %>% summarise(count = n())
## `summarise()` has grouped output by 'SUBJECT_OFFENSE'. You can override using
## the `.groups` argument.
ggplot(data_count, aes(x = fct_rev(fct_reorder(SUBJECT_OFFENSE,count)), y = count))+
geom_col( aes(fill = SUBJECT_RACE), width = 0.7) +
labs(x = "Type of Incidents", y = "Incidents Count", title = "Incident count against type of incidents by Race")+
theme_bw()+
labs(fill='Race')+
coord_flip()
Overall APOWW is the most common incident followed by No Arrest and
Public intoxication. Warrant and Assault are the other common incidents.
White subject have bigger portion in public intoxication compared to
other race. On the otherhand Black race have bigger portion for warrant
and APOWW. Hispanic have bigger portion on No Arrest. When we mention
bigger portion it is compare to their size in the database.
freq_table <- table(data$SUBJECT_OFFENSE)
filtered_df <- data[data$SUBJECT_OFFENSE %in% names(freq_table[freq_table >= 20]), ]
filtered_df <- filtered_df %>%
filter(SUBJECT_RACE %in% c('Black')) %>%
filter(SUBJECT_GENDER %in% c('Male', 'Female'))
data_count <- filtered_df %>% group_by(SUBJECT_OFFENSE, SUBJECT_GENDER) %>% summarise(count = n())
## `summarise()` has grouped output by 'SUBJECT_OFFENSE'. You can override using
## the `.groups` argument.
ggplot(data_count, aes(x = fct_rev(fct_reorder(SUBJECT_OFFENSE,count)), y = count))+
geom_col( aes(fill = SUBJECT_GENDER), width = 0.7) +
labs(x = "Type of Incidents", y = "Incidents Count", title = "Incident count against type of incidents by Race")+
theme_bw()+
labs(fill='Gender')+
coord_flip()
APOWW and No arrest are the most common incidents among black subject.
Other then that Warant, public intoxication and Assault are quite
common. For APOWW, we can see more female has bigger portion compare to
other crime.
freq_table <- table(data$SUBJECT_OFFENSE)
filtered_df <- data[data$SUBJECT_OFFENSE %in% names(freq_table[freq_table >= 20]), ]
filtered_df <- filtered_df %>%
filter(SUBJECT_RACE %in% c('Hispanic')) %>%
filter(SUBJECT_GENDER %in% c('Male', 'Female'))
data_count <- filtered_df %>% group_by(SUBJECT_OFFENSE, SUBJECT_GENDER) %>% summarise(count = n())
## `summarise()` has grouped output by 'SUBJECT_OFFENSE'. You can override using
## the `.groups` argument.
ggplot(data_count, aes(x = fct_rev(fct_reorder(SUBJECT_OFFENSE,count)), y = count))+
geom_col( aes(fill = SUBJECT_GENDER), width = 0.7) +
labs(x = "Type of Incidents", y = "Incidents Count", title = "Incident count against type of incidents by Race")+
theme_bw()+
labs(fill='Gender')+
coord_flip()
No arrest, APOWW and Public intoxication are the most common incidents
among hispanic subject. Other then that Warant, public servant and
Assault are quite common. For APOWW, we can see more female has bigger
portion compare to other crime.
freq_table <- table(data$SUBJECT_OFFENSE)
filtered_df <- data[data$SUBJECT_OFFENSE %in% names(freq_table[freq_table >= 20]), ]
filtered_df <- filtered_df %>%
filter(SUBJECT_RACE %in% c('White')) %>%
filter(SUBJECT_GENDER %in% c('Male', 'Female'))
data_count <- filtered_df %>% group_by(SUBJECT_OFFENSE, SUBJECT_GENDER) %>% summarise(count = n())
## `summarise()` has grouped output by 'SUBJECT_OFFENSE'. You can override using
## the `.groups` argument.
ggplot(data_count, aes(x = fct_rev(fct_reorder(SUBJECT_OFFENSE,count)), y = count))+
geom_col( aes(fill = SUBJECT_GENDER), width = 0.7) +
labs(x = "Type of Incidents", y = "Incidents Count", title = "Incident count against type of incidents by Race")+
theme_bw()+
labs(fill='Gender')+
coord_flip()
APOWW, Public intoxication and No arrest are the most common incidents
among White subject. Other then that Warant, public servant and Burglary
are quite common. For APOWW and public intoxication, we can see more
female has bigger portion compare to other crime.
set.seed(1234)
# split the character column by comma
data$NO_OF_FORCE <- sapply(strsplit(as.character(data$FORCE_EFFECTIVE), ","), length)
#data$NO_OF_FORCE <- as.factor(data$NO_OF_FORCE)
# convert the new column to numeric
#data$NO_OF_FORCE <- as.numeric(data$NO_OF_FORCE)
table(data$NO_OF_FORCE)
##
## 1 2 3 4 5 6 7 8 10
## 747 763 486 230 96 39 17 4 1
# create a histogram
ggplot(data, aes(NO_OF_FORCE)) +
geom_histogram(fill = "skyblue", color = "black", bins = 10) +
labs(x = "Date", y = "Count", title = "Count of Day by Date") +
xlab("NO_OF_FORCE")+
scale_x_continuous("Type of Forces", labels = as.character(data$NO_OF_FORCE), breaks = data$NO_OF_FORCE)+
labs(x = "Number of Force", y = "Incidents Count", title = "Incident count against Number of Force")
filtered_df <- data %>%
filter(SUBJECT_RACE %in% c('White','Black', 'Hispanic'))
data_count <- filtered_df %>% group_by(NO_OF_FORCE, SUBJECT_RACE) %>% summarise(count = n())
## `summarise()` has grouped output by 'NO_OF_FORCE'. You can override using the
## `.groups` argument.
ggplot(data_count, aes(x = as.factor(NO_OF_FORCE), y = count))+
geom_col( aes(fill = SUBJECT_RACE), width = 0.7) +
labs(x = "Type of Incidents", y = "Incidents Count", title = "Incident count against type of incidents by Race")+
theme_bw()+
labs(fill='Gender')+
coord_flip()
filtered_df <- data %>%
filter(SUBJECT_GENDER %in% c('Male','Female'))
data_count <- filtered_df %>% group_by(NO_OF_FORCE, SUBJECT_GENDER) %>% summarise(count = n())
## `summarise()` has grouped output by 'NO_OF_FORCE'. You can override using the
## `.groups` argument.
ggplot(data_count, aes(x = as.factor(NO_OF_FORCE), y = count))+
geom_col( aes(fill = SUBJECT_GENDER), width = 0.7) +
labs(x = "Type of Incidents", y = "Incidents Count", title = "Incident count against type of incidents by Race")+
theme_bw()+
labs(fill='Race')+
coord_flip()
As we can see two type of force more used in female compare to their
population size. Male population face more type of forces. We can see a
increase overall when population is male with respect to force
number.
I took help from below resources
https://www.kaggle.com/code/shivamb/4-3-analysis-report-officer-level-analysis https://www.kaggle.com/code/yashedpotatoes/tidying-acs-data-in-r-and-python https://www.kaggle.com/code/araraonline/austin-use-of-force-eda https://www.kaggle.com/code/vincentkr18/eda-time-series-analysis-policing-equity https://epirhandbook.com/en/ggplot-basics.html https://towardsdatascience.com/how-to-create-a-correlation-matrix-with-too-many-variables-309cc0c0a57 http://lab.rady.ucsd.edu/sawtooth/business_analytics_in_r/Viz1.html